### Merge together pieces of the analysis dataset
pacman::p_load(data.table, tidyverse, sf, cowplot, viridis, fst, DescTools, qs)


## Preliminaries
rm(list = ls())

source("code/globals.R")

# Load the main data ----

main <- qread(file.path(WORKING, "maindata.qs")) %>%
  dplyr::select(
    ImportParcelID, BuildingOrImprovementNumber, incidentid, FIPS, State,
    destroyed, combinedyear, UnformattedAssessorParcelNumber,
    PropertyHouseNumber, PropertyStreetName, PropertyStreetSuffix,
    PropertyZip, PropertyLandUseStndCode, LotSizeAcres, TotalBedrooms,
    TotalCalculatedBathCount, ImprovementAssessedValue, TotalAssessedValue, badrecordflag, howmanystructures, badstructuretype,
    incidentstartdate, incidentname, IncidentMergeRate, street, street25, 
    street25_byside, even_house_num, sqfeet, dist_to_perim, inDINS, APN_orig, address_orig, lon_orig, lat_orig, in_damage_data
  )


# Load spatial data  -----
# LANDFIRE, RPS, and census block
homepoints_spatial <- read_fst(file.path(WORKING, "homepoints-spatial.fst")) %>%
  select(ImportParcelID, BuildingOrImprovementNumber, incidentid, elev, slope, aspect, fuelmodel, s_sw, bp, flep, wildfire_hazard, block, tract, blockgroup, in_paradise)

# Load jurisdiction indicators ----
sra <- read_fst(file.path(WORKING, "sra.fst")) %>%
  select(ImportParcelID, BuildingOrImprovementNumber, incidentid, SRA, SRAcurrent, 
         hazclass_sra_85, hazclass_sra_07, hazclass_lra_98, hazclass_lra_08, vh_lra_98, vh_lra_08, regime)

# Load neighbors ----
neighbors <- read_fst(file.path(WORKING, "neighbor-vars.fst")) %>% rename(incidentid = UNIT_AND_NUMBER)

# Check that the non-neighbor datasets have the same number of rows as main -----
stopifnot(nrow(main) == nrow(homepoints_spatial) &
            nrow(main) == nrow(sra))

  
# Merge -----
data <- main %>%
  merge(homepoints_spatial, by = c("ImportParcelID", "BuildingOrImprovementNumber", "incidentid"), all = TRUE) %>%
  merge(sra, by = c("ImportParcelID", "BuildingOrImprovementNumber", "incidentid"), all = TRUE) %>%
  merge(neighbors, by = c("ImportParcelID", "BuildingOrImprovementNumber", "incidentid"), all = TRUE) 

stopifnot(nrow(data) == nrow(main))


# Remove bad structure records and non-single family homes -----

# Bad structure records
data <- data %>%
  mutate(badrecordflag = if_else(is.na(badrecordflag), FALSE, badrecordflag)) %>%
  filter(badrecordflag != TRUE)

# Non single-family homes
data <- data %>%
  filter(PropertyLandUseStndCode %in% c("RR101", "RR102"))

# Create fixed effects and bin variables  -----

data <- data %>%
  group_by(street, incidentid) %>%
  mutate(streetXfire = cur_group_id()) %>%
  group_by(street25, incidentid) %>%
  mutate(street25Xfire = cur_group_id()) %>%
  group_by(street25_byside, incidentid) %>%
  mutate(street25XfireXside = cur_group_id()) %>%
  group_by(street, even_house_num, incidentid) %>%
  mutate(streetXfireXside = cur_group_id()) %>%
  ungroup() %>%
  mutate(across(matches("street[[:digit:]]*Xfire"), factor))

# %>% mutate(across(matches("grid_id"), factor))

# Create some convenient year bins
data <- data %>%
  mutate(twoyearbin = cut(combinedyear,
    breaks = c(0, seq(1951, 2013, 2), 2020),
    labels = c("<1951", seq(1951, 2011, 2), "2013+"),
    right = FALSE, include.lowest = TRUE
  )) %>%
  mutate(fouryearbin = cut(combinedyear,
    breaks = c(0, seq(1951, 2011, 4), 2020),
    labels = c("<1951", seq(1951, 2007, 4), "2011+"),
    right = FALSE, include.lowest = TRUE
  )) %>%
  mutate(tenyearbin = cut(combinedyear,
    breaks = c(0, seq(1959, 2009, 10), 2020),
    labels = c("<1959", seq(1959, 1999, 10), "2009+"),
    right = FALSE, include.lowest = TRUE
  ))


# Create additional useful variables
data <- data %>%
  mutate(fireyear = year(incidentstartdate)) %>%
  ### "To code" variables by regime -- only used for neighbor regression
  # mutate(tocode = case_when(
  #   regime == "fra" ~ FALSE,
  #   regime == "sra" & combinedyear >= 1995 ~ TRUE,
  #   regime == "sra" & combinedyear < 1995 ~ FALSE,
  #   regime == "lraNN" ~ FALSE,
  #   regime == "lraYY" & combinedyear >= 1998 ~ TRUE,
  #   regime == "lraYY" & combinedyear < 1998 ~ FALSE,
  #   regime == "lraNY" & combinedyear >= 2008 ~ TRUE,
  #   regime == "lraNY" & combinedyear < 2008 ~ FALSE,
  #   regime == "lraYN" & combinedyear >= 1998 ~ TRUE,
  #   regime == "lraYN" & combinedyear < 1998 ~ FALSE,
  #   regime %in% c("otherstates", "localcodes") ~ FALSE,
  #   is.na(combinedyear) ~ NA # Homes w/out combined year are not determiend to be 
  # )) %>%
  ### Aggregated group and time period dummies
  mutate(treatmentgroup = case_when(
    regime %in% c("otherstates", "lraNN") ~ "No-codes",
    regime == "sra" ~ "SRA",
    regime %in% c("lraYY", "lraNY", "lraYN") ~ "LRA-VHFHSZ"
  )) %>%
  
  # mutate(treated = case_when(
  #   regime == "sra" & combinedyear < 1998 ~ 0,
  #   regime == "sra" & combinedyear >= 1998 ~ 1,
  #   regime %in% c("lraYY", "lraYN") & combinedyear < 1998 ~ 0,
  #   regime %in% c("lraYY", "lraYN") & combinedyear >= 1998 ~ 1,
  #   regime == "lraNY" & combinedyear < 2008 ~ 0,
  #   regime == "lraNY" & combinedyear >= 2008 ~ 1,
  #   regime %in% c("lraNN", "otherstates") ~ 0
  # )) %>%
  mutate(period = case_when(
    combinedyear < 1998 ~ 0,
    combinedyear >= 1998 & combinedyear < 2008 ~ 1,
    combinedyear >= 2008 ~ 2,
    is.na(combinedyear) ~ NA, # Homes missing year built are marked NA
  )) %>%
  mutate(
    period = factor(period),
    treatmentgroup = factor(treatmentgroup)
  ) %>%
  ### Aggregate thinly-observed fuel model categories
  mutate(fuelmodel = as.character(fuelmodel)) %>%
  group_by(fuelmodel) %>%
  mutate(fuelobs = n()) %>%
  ungroup() %>%
  mutate(fuelmodel = if_else(fuelobs < 1000, "other", fuelmodel)) %>%
  mutate(fuelmodel = factor(fuelmodel)) %>%
  select(-fuelobs) %>%
  ## calculate structure age at time of fire

  mutate(
    structureage = year(incidentstartdate) - combinedyear,
    structureagebin = cut(structureage, breaks = c(seq(0, 70, 10), 1000))
  )


# Bring in geocoding and geocoding source ----

loc_sources <- qread(file.path(WORKING, "zillowdata.qs")) %>%
  select(ImportParcelID, loc_source, lat, lon) %>%
  distinct(ImportParcelID, .keep_all = T)

modestatistic <- function(x) which.max(tabulate(x))
nrowbefore <- nrow(data)

data <- data %>%
  merge(loc_sources, by = "ImportParcelID", all.x = TRUE) %>%
  mutate(loc_source = factor(loc_source)) %>%
  group_by(incidentid) %>%
  mutate(
    dominantlocmethod = modestatistic(loc_source),
    dominantlocmethod = levels(loc_source)[dominantlocmethod]
  ) %>%
  ungroup()

stopifnot(nrow(data) == nrowbefore)
stopifnot(anyNA(data$dominantlocmethod) == 0)
stopifnot(anyNA(data$loc_source) == 0)

rm(modestatistic, nrowbefore)


### Additional data processing ----

startyear <- -9999 # Optional: Specify a cutoff year for inclusion -- another margin of robustness
endyear <- 9999

# Compute proportion destroyed by incident
data <- data %>%
  group_by(incidentid) %>%
  mutate(
    inc_destroyed = sum(destroyed) / n(),
    inc_destroyed_gt50 = inc_destroyed >= 0.5
  ) %>%
  ungroup()

# Add indicator if more than 1000 homes were at risk in a given incident
data <- data %>%
  group_by(incidentid) %>%
  mutate(
    inc_atrisk = n(),
    inc_atrisk_gt1000 = inc_atrisk >= 1000
  ) %>%
  ungroup()


# Flag if treatment status has changed between the time of treatment and the present day
data <- data %>% mutate(sameSRA = SRA == SRAcurrent)
data %>%
  count(sameSRA, SRA, SRAcurrent, regime) %>%
  mutate(pct = n / sum(n) * 100) %>%
  filter(SRA != SRAcurrent)
# Most changes are SRA homes becoming LRA homes

# Flag if homes were affected by >1 incident
data <- data %>% left_join(
  data %>%
    count(ImportParcelID, BuildingOrImprovementNumber) %>% transmute(ImportParcelID, BuildingOrImprovementNumber, multiple_incidents = n > 1)
)


# Square footage in thousands for display
data <- data %>% mutate(sqfeet1000 = sqfeet / 1000)

# Assessed value in 1000s and logs (regressions only)
data <- data %>%
  mutate(TotalAssessedValue1000 = TotalAssessedValue / 1000,
         log_assessed_value = log(TotalAssessedValue)
  )

# Winsorize, log lot size
data <- data %>%
  mutate(
    LotSizeAcresW = DescTools::Winsorize(LotSizeAcres, quantile(LotSizeAcres, probs = c(0.01, 0.99), na.rm = T)),
    logLotSize = log(LotSizeAcres)
  )



# Label resource constrained incidents
resource_constrained_incidents <- read_csv(file.path(INPUT_PRIVATE, "major-incidents-with-resource-constraints.csv"))
data <- data %>% mutate(inc_constrained = incidentname %in% resource_constrained_incidents$incidentname)

# For regressions, we drop local codes & FRA & 
# (And most regressions we also won't use lraNN)
data <- data %>% 
  mutate(sample_main = regime %in% c("sra", "lraYY", "lraYN", "lraNY", "otherstates") &
           complete.cases(period, destroyed, slope, fuelmodel),
         sample_CA = regime %in% c("sra", "lraYY", "lraYN", "lraNY", "lraNN") &
           complete.cases(period, destroyed, slope, fuelmodel))

data %>% count(period)

# Save analysis dataset ----

summary(data)

qsave(data, file.path(WORKING, "analysisdataset.qs"))
